knitr::opts_chunk$set(echo = TRUE)
#define working directory
knitr::opts_knit$set(root.dir = '/Users/jasoncosgrove/Dropbox (Team_Perie)/Jason/Experiments/Dry_Lab/MetaFate_final_pipeline/data/Step3_DNA_BC_processing/all_samples/freqthresh_1/')
rm(list = ls())
#define path
path = "/Users/jasoncosgrove/Dropbox (Team_Perie)/Jason/Experiments/Dry_Lab/MetaFate_final_pipeline/data/Step3_DNA_BC_processing/all_samples/freqthresh_1/"
setwd(path)
library(ggplot2)
library(plyr)
library(reshape2)
library(stringr)
Load the demultiplexed reads for each sample and combine them into a single data frame. In this format the columns are samples, and the rows are barcodes.
mergedData <- mergeSamples(path)
#get rid of mice where we dont have technical replicates or both M and E samples
mergedData <- mergedData[,!grepl("534",colnames(mergedData))]
mergedData <- mergedData[,!grepl("M454_M_",colnames(mergedData))]
For each sample normalise the reads by dividing each barcode by the sample total. With this transformation all reads in a given sample sum to 1
norm.data <- normaliseData(mergedData)
some barcodes are not very common and may be the result of PCR or sequencing errors and do not represent true barcodes. We look at the cumulative read distribution to see what could be a sensible threshold to filter the barcodes on.
We filter samples based on the read abundance determined from the cumulative distribution of read counts
#fraction threshold is 0.003 based on MEF data
threshold=0.003
#plot the cumulative read distribution to see if our read threshold is sensible
for(i in 1:ncol(norm.data)){
P = ecdf(norm.data[,i])
plot(P)
abline(v=threshold, col="blue")
}
For each sample we have a and b replicates, create a new dataframe where each value represents the summed a and b replicates
#make a file with sum of duplicates
#load data with duplicates
d <- norm.data
#add duplicates
mergedReplicates <- as.data.frame(matrix(0, ncol = (dim(d)[2]-1)/2, nrow = dim(d)[1]))
j=0
for (i in seq(1,dim(d)[2],2)) {
j=j+1
mergedReplicates[,j] <- (d[,i]+d[,i+1])
}
names(mergedReplicates) <- c("JCW26_M1_M","JCW26_M1_E",
"JCW26_M2_M","JCW26_M2_E",
"JCW26_M3_M","JCW26_M3_E",
"JCW26_M4_M","JCW26_M4_E",
"ET27a_M1_M","ET27a_M1_E",
"ET27b_M1_M","ET27b_M1_E",
"ET21_M454_M")
rownames(mergedReplicates) <- rownames(d)
We want to convert the data from a reads scale to a cell scale so we normalise based on cell numbers that we get from our cell sorting metadata. We do this for each replicate, and so perform the normalisation on the unmerged data
cell.numbers <- read.csv("/Users/jasoncosgrove/Dropbox (Team_Perie)/Jason/Experiments/Dry_Lab/MetaFate_final_pipeline/data/Step3_DNA_BC_processing/cell_counts.csv")
data.cell.norm <- cellNumberNormalisation(norm.data,cell.numbers,merged = FALSE)
mergedReplicates.cell.norm <- cellNumberNormalisation(mergedReplicates,cell.numbers,merged = TRUE)
Convert the data to long format to facilitate plotting
dab <- convertToLongFormat(norm.data)
dab.cellnorm <- convertToLongFormat(data.cell.norm)
#do the same but without the cell reconversion
final.bcs <- unique(dab$tag)
final.bc.matrix <- mergedReplicates[final.bcs,]
final.bc.matrix.cellnorm <- mergedReplicates.cell.norm[final.bcs,]
colSums(final.bc.matrix > 0)
## JCW26_M1_M JCW26_M1_E JCW26_M2_M JCW26_M2_E JCW26_M3_M JCW26_M3_E
## 18 15 49 61 69 54
## JCW26_M4_M JCW26_M4_E ET27a_M1_M ET27a_M1_E ET27b_M1_M ET27b_M1_E
## 35 57 300 296 174 464
## ET21_M454_M
## 157
write.csv(final.bc.matrix, "DRAG_DNA_barcodes_normalised_no_ab_filtering.csv")
write.csv(final.bc.matrix.cellnorm, "DRAG_DNA_barcodes_normalised_no_ab_filtering_cellnorm.csv")
#plot new self/self before abfiltering
qplot(asinh(vara), asinh(varb), data=dab) +
facet_wrap(~paste(mouse,expt, celltype, sep = " _ "))
#plot new self/self before abfiltering with cell normalisation
qplot(asinh(vara), asinh(varb), data=dab.cellnorm) +
facet_wrap(~paste(mouse,expt, celltype, sep = " _ "))
remove barcodes which only occur in one of the two technical replicates
dab.filtered <- dab[which(dab$vara>0 & dab$varb>0),]
dab.cellnorm.filtered <- dab.cellnorm[which(dab$vara>0 & dab$varb>0),]
final.bcs <- unique(dab.filtered$tag)
final.bc.matrix <- mergedReplicates[final.bcs,]
final.bc.matrix.cellnorm <- mergedReplicates.cell.norm[final.bcs,]
colSums(final.bc.matrix > 0)
## JCW26_M1_M JCW26_M1_E JCW26_M2_M JCW26_M2_E JCW26_M3_M JCW26_M3_E
## 13 12 34 42 46 35
## JCW26_M4_M JCW26_M4_E ET27a_M1_M ET27a_M1_E ET27b_M1_M ET27b_M1_E
## 28 30 97 122 75 81
## ET21_M454_M
## 82
write.csv(final.bc.matrix.cellnorm, "DRAG_DNA_barcodes_normalised_and_ab_filtered_cellnorm.csv")
write.csv(final.bc.matrix, "DRAG_DNA_barcodes_normalised_and_ab_filtered.csv")
colSums(final.bc.matrix > 0)
## JCW26_M1_M JCW26_M1_E JCW26_M2_M JCW26_M2_E JCW26_M3_M JCW26_M3_E
## 13 12 34 42 46 35
## JCW26_M4_M JCW26_M4_E ET27a_M1_M ET27a_M1_E ET27b_M1_M ET27b_M1_E
## 28 30 97 122 75 81
## ET21_M454_M
## 82
qplot(asinh(vara), asinh(varb), data=dab.filtered) +
facet_wrap(~paste( mouse,expt, celltype, sep = " _ "))
qplot(asinh(vara), asinh(varb), data=dab.cellnorm.filtered) +
facet_wrap(~paste( mouse,expt, celltype, sep = " _ "))
filter barcodes which only occur in one of the two technical replicates or if they are only in one sample then the proportional abundance of a barcode in a sample must be above a threshold value
dab.filtered2 <- dab[(dab$vara>0 & dab$varb>0 | dab$vara > threshold| dab$varb > threshold) ,]
dab.cellnorm.filtered2 <- dab.cellnorm[(dab$vara>0 & dab$varb>0 | dab$vara > threshold| dab$varb > threshold),]
final.bcs2 <- unique(dab.filtered2$tag)
final.bc.matrix2 <- mergedReplicates[final.bcs2,]
final.bc.matrix.cellnorm2 <- mergedReplicates.cell.norm[final.bcs2,]
colSums(final.bc.matrix2 > 0)
## JCW26_M1_M JCW26_M1_E JCW26_M2_M JCW26_M2_E JCW26_M3_M JCW26_M3_E
## 18 15 44 49 62 42
## JCW26_M4_M JCW26_M4_E ET27a_M1_M ET27a_M1_E ET27b_M1_M ET27b_M1_E
## 35 47 128 154 107 186
## ET21_M454_M
## 106
write.csv(final.bc.matrix2, "DRAG_DNA_barcodes_normalised_and_ab_filtered2.csv")
write.csv(final.bc.matrix.cellnorm2, "DRAG_DNA_barcodes_normalised_and_ab_filtered2_cellnorm.csv")
qplot(asinh(vara), asinh(varb), data=dab.filtered2) + facet_wrap(~paste( mouse,expt, celltype, sep = " _ "))
qplot(asinh(vara), asinh(varb), data=dab.cellnorm.filtered2) + facet_wrap(~paste( mouse,expt, celltype, sep = " _ "))
letās assess how well our filtering retains paired barcodes for which we have a higher confidence that they are a real barcode
bc.pairs <- read.csv("/Users/jasoncosgrove/Dropbox (Team_Perie)/Jason/Experiments/Dry_Lab/MetaFate_final_pipeline/data/Step6_10X_Data_integration/DNA_RNA_BC_PAIRS/all_dna_rna_bc_pairs.csv")
bc.pairs$expt <- vapply(strsplit(bc.pairs$expt_label,"_"), `[`, 1, FUN.VALUE=character(1))
bc.pairs$mouse <- vapply(strsplit(bc.pairs$expt_label,"_"), `[`, 2, FUN.VALUE=character(1))
expts <- unique(dab$expt)
for(i in 1:length(expts)){
expt <- expts[i]
dab.filtered <- dab.cellnorm.filtered2[dab.cellnorm.filtered2$expt == expt,]
bc.pairs.filtered <- bc.pairs[bc.pairs$expt == expt,]
pairs <- intersect(dab.filtered$tag,bc.pairs$all.bc.pairs)
dab.filtered$paired <- "FALSE"
dab.filtered[dab.filtered$tag %in% pairs,]$paired <- "TRUE"
#fp <- paste("/Users/jasoncosgrove/Desktop/",expt,threshold,".tiff",sep = "")
#tiff(fp, width = 6, height = 6,res = 200,units = "in")
p <- qplot(asinh(vara), asinh(varb), data=dab.filtered,colour = paired) + facet_wrap(~paste(celltype,mouse, sep = " _ "))
plot(p)
#dev.off()
}